The word clouds for August and November tweets were generated using the steps below:
Note: to generate these word clouds, you must specify the path to a TrueType font as a parameter for the WordCloud class.
Resources:
word-cloud package: http://amueller.github.io/word_cloud/index.html
Additional code help from http://spartanideas.msu.edu/2014/11/28/turn-your-twitter-timeline-into-a-word-cloud-using-python/
In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
import Image
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import numpy as np
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag
from wordcloud import WordCloud, STOPWORDS
import os.path
import json
import pymongo
In [32]:
def makeWordCloud(df, number_tweets, max_words):
    """Render a word cloud from a random sample of tweets.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'text' column of tweet strings.
    number_tweets : int
        Number of rows to sample without replacement.
    max_words : int
        Maximum number of words displayed in the cloud.
    """
    # Take a random sample of the dataframe based on the user's input.
    rows = np.random.choice(df.index.values, number_tweets, replace=False)
    # .ix is deprecated/removed in modern pandas; .loc does label indexing.
    sampled_df = df.loc[rows]
    words = ' '.join(sampled_df['text'])
    # Tokenize into words; drop words shorter than 3 chars, English stop
    # words, custom stop words, and non-alphabetic tokens (numbers, URLs).
    tokens = [w.lower() for w in word_tokenize(words)]
    english_stops = set(stopwords.words('english'))
    custom_stops = {'http', 'https', 'amp', 'ferguson', 'via'}
    words_fin = " ".join(
        w for w in tokens
        if len(w) > 2 and w not in english_stops
        and w not in custom_stops and w.isalpha()
    )
    lemmatizer = WordNetLemmatizer()
    # Generate part-of-speech tags so verbs can be lemmatized as verbs
    # (WordNetLemmatizer defaults to noun otherwise).
    word_pos = pos_tag(word_tokenize(words_fin))
    lemmas = []
    for word, tag in word_pos:
        pos = 'v' if tag.startswith('V') else 'n'
        lemmas.append(lemmatizer.lemmatize(word, pos))
    string_l = ' '.join(lemmas) + ' '
    # Send processed text to WordCloud.
    # BUG FIX: the original hard-coded max_words=50, ignoring the parameter.
    wordcloud = WordCloud(
        font_path='/usr/share/fonts/truetype/msttcorefonts/Georgia.ttf',
        stopwords=STOPWORDS,
        prefer_horizontal=0.9,
        max_words=max_words,
    ).generate(string_l)
    plt.imshow(wordcloud)
    plt.axis('off')
    plt.show()
#resources:
#wordcloud: https://github.com/amueller/word_cloud and http://spartanideas.msu.edu/2014/11/28/turn-your-twitter-timeline-into-a-word-cloud-using-python
#randomly selecting from dataframe: http://stackoverflow.com/questions/12190874/pandas-sampling-a-dataframe
In [37]:
# Connect to MongoDB and build a DataFrame of August tweet text.
mdb = pymongo.MongoClient('mongodb://10.208.160.157')
db = mdb.ferguson
aug_tweets = db.tweets_aug
tweet_fields = ['text']
# Pass the projection positionally: the `fields=` keyword only exists in
# pymongo 2.x; a positional list projection works in both 2.x and 3.x.
tweets = aug_tweets.find({}, tweet_fields)
tweets_aug = pd.DataFrame(list(tweets), columns=tweet_fields)
makeWordCloud(tweets_aug, 1000, 50)
In [30]:
# Connect to MongoDB and build a DataFrame of November tweet text,
# restricted to the week of 2014-11-25 through 2014-12-02.
mdb = pymongo.MongoClient('mongodb://10.208.160.157')
db = mdb.ferguson
nov_tweets = db.tweets
from datetime import datetime
start = datetime(2014, 11, 25)
end = datetime(2014, 12, 2)
tweet_fields = ['text']
# Pass the projection positionally: the `fields=` keyword only exists in
# pymongo 2.x; a positional list projection works in both 2.x and 3.x.
tweets = nov_tweets.find(
    {"_iso_created_at": {"$gte": start, "$lte": end}},
    tweet_fields,
)
tweets_nov = pd.DataFrame(list(tweets), columns=tweet_fields)
makeWordCloud(tweets_nov, 1000, 50)
In [21]:
def makeWordCloudNoPolice(df, number_tweets, max_words):
    """Render a word cloud from a random sample of tweets, excluding
    the word 'police' (in addition to the standard stop words).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'text' column of tweet strings.
    number_tweets : int
        Number of rows to sample without replacement.
    max_words : int
        Maximum number of words displayed in the cloud.
    """
    # Take a random sample of the dataframe based on the user's input.
    rows = np.random.choice(df.index.values, number_tweets, replace=False)
    # .ix is deprecated/removed in modern pandas; .loc does label indexing.
    sampled_df = df.loc[rows]
    words = ' '.join(sampled_df['text'])
    # Tokenize into words; drop words shorter than 3 chars, English stop
    # words, custom stop words (including 'police'), and non-alphabetic
    # tokens (numbers, URLs).
    tokens = [w.lower() for w in word_tokenize(words)]
    english_stops = set(stopwords.words('english'))
    custom_stops = {'http', 'https', 'amp', 'ferguson', 'via', 'police'}
    words_fin = " ".join(
        w for w in tokens
        if len(w) > 2 and w not in english_stops
        and w not in custom_stops and w.isalpha()
    )
    lemmatizer = WordNetLemmatizer()
    # Generate part-of-speech tags so verbs can be lemmatized as verbs
    # (WordNetLemmatizer defaults to noun otherwise).
    word_pos = pos_tag(word_tokenize(words_fin))
    lemmas = []
    for word, tag in word_pos:
        pos = 'v' if tag.startswith('V') else 'n'
        lemmas.append(lemmatizer.lemmatize(word, pos))
    string_l = ' '.join(lemmas) + ' '
    # Send processed text to WordCloud.
    # BUG FIX: the original hard-coded max_words=50, ignoring the parameter.
    wordcloud = WordCloud(
        font_path='/usr/share/fonts/truetype/msttcorefonts/Georgia.ttf',
        stopwords=STOPWORDS,
        prefer_horizontal=0.9,
        max_words=max_words,
    ).generate(string_l)
    plt.imshow(wordcloud)
    plt.axis('off')
    plt.show()
#resources: https://github.com/amueller/word_cloud and http://spartanideas.msu.edu/2014/11/28/turn-your-twitter-timeline-into-a-word-cloud-using-python
In [40]:
# Connect to MongoDB and build a DataFrame of August tweet text,
# then draw the cloud with 'police' excluded.
mdb = pymongo.MongoClient('mongodb://10.208.160.157')
db = mdb.ferguson
aug_tweets = db.tweets_aug
tweet_fields = ['text']
# Pass the projection positionally: the `fields=` keyword only exists in
# pymongo 2.x; a positional list projection works in both 2.x and 3.x.
tweets = aug_tweets.find({}, tweet_fields)
tweets_aug = pd.DataFrame(list(tweets), columns=tweet_fields)
makeWordCloudNoPolice(tweets_aug, 1000, 50)
In [28]:
# Connect to MongoDB and build a DataFrame of November tweet text
# (week of 2014-11-25 through 2014-12-02), then draw the cloud with
# 'police' excluded.
mdb = pymongo.MongoClient('mongodb://10.208.160.157')
db = mdb.ferguson
nov_tweets = db.tweets
from datetime import datetime
start = datetime(2014, 11, 25)
end = datetime(2014, 12, 2)
tweet_fields = ['text']
# Pass the projection positionally: the `fields=` keyword only exists in
# pymongo 2.x; a positional list projection works in both 2.x and 3.x.
tweets = nov_tweets.find(
    {"_iso_created_at": {"$gte": start, "$lte": end}},
    tweet_fields,
)
tweets_nov = pd.DataFrame(list(tweets), columns=tweet_fields)
makeWordCloudNoPolice(tweets_nov, 1000, 50)